@pi-unipi/web-api 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -7,15 +7,17 @@ Web search, read, and summarize tools with provider-based backend selection for
7
7
  `@pi-unipi/web-api` provides agent tools for web access:
8
8
 
9
9
  - **web_search** — Search the web using various providers
10
- - **web_read** — Extract content from URLs
10
+ - **multi_web_content_read** — Extract content from URLs using the local smart-fetch engine (default) or provider fallbacks
11
11
  - **web_llm_summarize** — Summarize web content using LLM
12
12
 
13
- Providers are ranked by capability and cost, allowing smart auto-selection.
13
+ The read path uses a **smart-fetch engine** by default — free, local, no API key required.
14
14
 
15
15
  ## Features
16
16
 
17
+ - **Smart-Fetch Engine** — Local content extraction with browser-grade TLS fingerprinting
17
18
  - **Provider-based architecture** — Multiple search/read providers with unified interface
18
19
  - **Smart selection** — Auto-select cheapest available provider
20
+ - **Batch reading** — Fetch multiple URLs concurrently with progress
19
21
  - **API key management** — Interactive TUI for key configuration
20
22
  - **Caching** — Web content cached with configurable TTL
21
23
  - **Subagent integration** — Tools automatically available to spawned subagents
@@ -38,6 +40,24 @@ Add to your pi configuration:
38
40
  }
39
41
  ```
40
42
 
43
+ ## Smart-Fetch Engine
44
+
45
+ The smart-fetch engine is a local content extraction pipeline:
46
+
47
+ | Component | Purpose |
48
+ |-----------|---------|
49
+ | **wreq-js** | Browser-grade TLS fingerprinting (bypasses Cloudflare) |
50
+ | **defuddle** | Intelligent content extraction from HTML |
51
+ | **linkedom** | Server-side DOM parsing |
52
+
53
+ **Features:**
54
+ - No API key required
55
+ - Browser-level anti-bot bypass
56
+ - Clean markdown output with metadata (title, author, site, word count)
57
+ - Batch concurrent fetching with progress
58
+ - Client-side meta redirect following
59
+ - Multiple output formats (markdown, HTML, text, JSON)
60
+
41
61
  ## Providers
42
62
 
43
63
  ### Search Providers
@@ -54,10 +74,13 @@ Add to your pi configuration:
54
74
 
55
75
  | Provider | Rank | Cost | API Key |
56
76
  |----------|------|------|---------|
77
+ | **Smart-Fetch Engine** | 0 | Free | No |
57
78
  | Jina AI Reader | 1 | Freemium | Optional |
58
79
  | Firecrawl | 2 | Paid | Required |
59
80
  | Perplexity | 3 | Paid | Required |
60
81
 
82
+ **Note:** Rank 0 is the smart-fetch engine (default). Provider fallbacks are available via the `source` parameter (for example, `source: 1` for Jina AI Reader).
83
+
61
84
  ### Summarize Providers
62
85
 
63
86
  | Provider | Rank | Cost | API Key |
@@ -85,10 +108,18 @@ export PERPLEXITY_API_KEY="your-key"
85
108
  export JINA_API_KEY="your-key"
86
109
  ```
87
110
 
111
+ ### Smart-Fetch Defaults
112
+
113
+ Configure default browser profile, OS, max chars, timeout, etc. via:
114
+
115
+ ```
116
+ /unipi:web-settings → "Smart Fetch Defaults"
117
+ ```
118
+
88
119
  ### Settings Files
89
120
 
90
121
  - **Auth:** `~/.unipi/config/web-api/auth.json` (API keys, gitignored)
91
- - **Config:** `~/.unipi/config/web-api/config.json` (provider settings)
122
+ - **Config:** `~/.unipi/config/web-api/config.json` (provider settings, smart-fetch defaults)
92
123
 
93
124
  ## Usage
94
125
 
@@ -102,14 +133,23 @@ web_search(query: "TypeScript generics")
102
133
  web_search(query: "latest AI research", source: 4) # Tavily
103
134
  ```
104
135
 
105
- ### Web Read
136
+ ### Multi Web Content Read
106
137
 
107
138
  ```
108
- # Auto-select provider
109
- web_read(url: "https://example.com/article")
139
+ # Single URL (uses smart-fetch engine by default)
140
+ multi_web_content_read(url: "https://example.com/article")
110
141
 
111
- # Use specific provider
112
- web_read(url: "https://example.com/spa", source: 2) # Firecrawl
142
+ # Batch URLs
143
+ multi_web_content_read(url: ["https://example.com/a", "https://example.com/b"])
144
+
145
+ # Use provider fallback (Jina Reader)
146
+ multi_web_content_read(url: "https://example.com/article", source: 1)
147
+
148
+ # Custom options
149
+ multi_web_content_read(url: "https://example.com/article", format: "json", maxChars: 10000)
150
+
151
+ # Advanced: custom browser profile
152
+ multi_web_content_read(url: "https://example.com/article", browser: "chrome_145", os: "windows")
113
153
  ```
114
154
 
115
155
  ### Web Summarize
@@ -126,10 +166,11 @@ web_llm_summarize(url: "https://example.com/research", prompt: "Extract key find
126
166
 
127
167
  ### /unipi:web-settings
128
168
 
129
- Interactive settings dialog for managing providers and API keys.
169
+ Interactive settings dialog for managing providers, API keys, and smart-fetch defaults.
130
170
 
131
- - **Auto-enable on key input** — provider is automatically enabled when you add a valid API key (no extra toggle step)
132
- - **Cursor memory** — last configured provider moves to the top of the list when you return to the menu
171
+ - **Auto-enable on key input** — provider is automatically enabled when you add a valid API key
172
+ - **Smart-fetch configuration** — set default browser, OS, timeout, etc.
173
+ - **Cursor memory** — last configured provider moves to the top of the list
133
174
 
134
175
  ### /unipi:web-cache-clear
135
176
 
@@ -139,7 +180,18 @@ Clear all cached web content.
139
180
 
140
181
  - Default TTL: 1 hour
141
182
  - Cache location: `~/.unipi/config/web-api/cache/`
142
- - Automatic for web_read operations
183
+ - Smart-fetch cache keys include URL + browser + format + maxChars
184
+ - Automatic for all read operations
185
+
186
+ ## Dependencies
187
+
188
+ | Package | Version | Purpose |
189
+ |---------|---------|---------|
190
+ | wreq-js | ^2.3.0 | TLS fingerprinting |
191
+ | defuddle | ^0.18.1 | Content extraction |
192
+ | linkedom | ^0.18.12 | Server-side DOM |
193
+ | lodash | ^4.17.21 | Filename sanitization |
194
+ | mime-types | ^2.1.35 | MIME type mapping |
143
195
 
144
196
  ## Troubleshooting
145
197
 
@@ -151,6 +203,14 @@ If you see "No search provider available":
151
203
  2. Add API keys for paid providers (they auto-enable on key input)
152
204
  3. Or manually enable a free provider
153
205
 
206
+ ### Smart-fetch fails
207
+
208
+ If smart-fetch fails to extract content:
209
+
210
+ 1. Try a different browser profile: `browser: "chrome_133"`
211
+ 2. Try a provider fallback: `source: 1` (Jina Reader)
212
+ 3. Check if the site requires JavaScript execution (not supported)
213
+
154
214
  ### API key invalid
155
215
 
156
216
  If API key validation fails:
@@ -164,14 +224,15 @@ If API key validation fails:
164
224
  If you hit rate limits:
165
225
 
166
226
  1. Add an API key for higher limits
167
- 2. Use a different provider
168
- 3. Wait and retry
227
+ 2. Use the smart-fetch engine (the default for reads; it needs no API key, so provider quotas do not apply)
228
+ 3. Use a different provider
229
+ 4. Wait and retry
169
230
 
170
231
  ## Development
171
232
 
172
233
  ```bash
173
234
  # Type check
174
- npm run typecheck
235
+ npx tsc --noEmit
175
236
 
176
237
  # Build
177
238
  npm run build
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pi-unipi/web-api",
3
- "version": "0.1.13",
3
+ "version": "0.1.15",
4
4
  "description": "Web search, read, and summarize tools with provider-based backend selection for Pi coding agent",
5
5
  "type": "module",
6
6
  "license": "MIT",
@@ -38,13 +38,20 @@
38
38
  "README.md"
39
39
  ],
40
40
  "dependencies": {
41
- "@pi-unipi/core": "*"
41
+ "@pi-unipi/core": "*",
42
+ "defuddle": "^0.18.1",
43
+ "linkedom": "^0.18.12",
44
+ "lodash": "^4.17.21",
45
+ "mime-types": "^2.1.35",
46
+ "wreq-js": "^2.3.0"
42
47
  },
43
48
  "peerDependencies": {
44
49
  "@mariozechner/pi-coding-agent": "*",
50
+ "@mariozechner/pi-tui": "*",
45
51
  "@sinclair/typebox": "*"
46
52
  },
47
53
  "devDependencies": {
54
+ "@types/lodash": "^4.17.24",
48
55
  "@types/node": "^25.6.0"
49
56
  }
50
57
  }
@@ -5,7 +5,7 @@ description: "Web search, read, and summarize tools with provider-based backend"
5
5
 
6
6
  # Web Tools
7
7
 
8
- Use these tools to access web content. Providers are ranked by capability and cost.
8
+ Use these tools to access web content. The read path uses a local smart-fetch engine by default — free, fast, and no API key required.
9
9
 
10
10
  ## web_search
11
11
 
@@ -24,21 +24,43 @@ web_search(query: "TypeScript generics tutorial")
24
24
  web_search(query: "latest AI research", source: 4) # Use Tavily
25
25
  ```
26
26
 
27
- ## web_read
27
+ ## multi_web_content_read
28
28
 
29
- Read URL content. Lower `source` = simpler providers.
29
+ Read and extract content from URLs. Uses the **smart-fetch engine** by default (source=0 or omitted) — free, local, no API key required. Supports single URL or batch URLs.
30
30
 
31
- - **Basic extraction:** source 1 (Jina Reader)
32
- - **Advanced crawling:** source 2 (Firecrawl)
31
+ **Default behavior (source=0):**
32
+ - Browser-grade TLS fingerprinting via wreq-js
33
+ - Intelligent content extraction via defuddle
34
+ - Returns clean markdown with metadata (title, author, site, word count)
35
+ - No API key required
33
36
 
34
37
  **Parameters:**
35
- - `url` (required): URL to read
36
- - `source` (optional): Provider selection (1-3)
38
+ - `url` (required): Single URL string or array of URLs for batch
39
+ - `source` (optional): Provider selection (0=smart-fetch, 1=Jina Reader, 2=Firecrawl, 3=Perplexity)
40
+ - `browser` (optional): TLS fingerprint profile (default: chrome_145)
41
+ - `os` (optional): OS fingerprint (default: windows)
42
+ - `format` (optional): Output format — markdown, html, text, json (default: markdown)
43
+ - `maxChars` (optional): Maximum content characters (default: 50000)
44
+ - `timeoutMs` (optional): Request timeout in ms (default: 15000)
45
+ - `removeImages` (optional): Strip image references (default: false)
46
+ - `includeReplies` (optional): Include comments/replies — `true`, `false`, or `"extractors"` (default: `"extractors"`)
47
+ - `proxy` (optional): Proxy URL
48
+ - `batchConcurrency` (optional): Concurrent requests for batch (default: 8)
49
+ - `verbose` (optional): Include metadata header (default: true)
37
50
 
38
51
  **Examples:**
39
52
  ```
40
- web_read(url: "https://example.com/article")
41
- web_read(url: "https://example.com/spa", source: 2) # Use Firecrawl
53
+ # Single URL (uses smart-fetch engine by default)
54
+ multi_web_content_read(url: "https://example.com/article")
55
+
56
+ # Batch URLs
57
+ multi_web_content_read(url: ["https://example.com/a", "https://example.com/b"])
58
+
59
+ # Use provider fallback (Jina Reader)
60
+ multi_web_content_read(url: "https://example.com/article", source: 1)
61
+
62
+ # Custom options
63
+ multi_web_content_read(url: "https://example.com/article", format: "json", maxChars: 10000)
42
64
  ```
43
65
 
44
66
  ## web_llm_summarize
@@ -61,7 +83,7 @@ web_llm_summarize(url: "https://example.com/research", prompt: "Extract key find
61
83
 
62
84
  ## Provider Selection
63
85
 
64
- - Omit `source` for auto-selection (cheapest available)
86
+ - Omit `source` for auto-selection (smart-fetch engine for read, cheapest for search)
65
87
  - Specify `source` number for specific provider
66
88
  - If provider unavailable, tool throws descriptive error
67
89
 
@@ -75,6 +97,7 @@ web_llm_summarize(url: "https://example.com/research", prompt: "Extract key find
75
97
  5. Perplexity (paid)
76
98
 
77
99
  **Read providers:**
100
+ 0. **Smart-Fetch Engine** (free, local) — default
78
101
  1. Jina AI Reader (freemium)
79
102
  2. Firecrawl (paid)
80
103
  3. Perplexity (paid)
@@ -83,8 +106,27 @@ web_llm_summarize(url: "https://example.com/research", prompt: "Extract key find
83
106
  1. Perplexity (paid)
84
107
  2. LLM Summarize (uses pi's LLM)
85
108
 
109
+ ## Smart-Fetch Engine
110
+
111
+ The smart-fetch engine is a local content extraction pipeline:
112
+
113
+ - **wreq-js**: Browser-grade TLS fingerprinting (bypasses Cloudflare, etc.)
114
+ - **defuddle**: Intelligent content extraction from HTML
115
+ - **linkedom**: Server-side DOM parsing
116
+
117
+ **Features:**
118
+ - No API key required
119
+ - Browser-level anti-bot bypass
120
+ - Clean markdown output with metadata
121
+ - Batch concurrent fetching with progress
122
+ - Client-side meta redirect following
123
+ - Multiple output formats
124
+
125
+ **Configure defaults** via `/unipi:web-settings` → "Smart Fetch Defaults"
126
+
86
127
  ## Cost Awareness
87
128
 
129
+ - **Smart-Fetch Engine:** Free (read only, no API key)
88
130
  - **DuckDuckGo:** Free (search only)
89
131
  - **Jina:** Freemium (search + read)
90
132
  - **SerpAPI/Tavily:** Paid (search)
@@ -98,6 +140,7 @@ Configure providers via `/unipi:web-settings` command.
98
140
 
99
141
  - Add/remove API keys
100
142
  - Enable/disable providers
143
+ - Configure smart-fetch defaults
101
144
  - View provider status
102
145
 
103
146
  ## Cache
@@ -105,4 +148,4 @@ Configure providers via `/unipi:web-settings` command.
105
148
  Web content is cached for 1 hour by default.
106
149
 
107
150
  - Clear cache: `/unipi:web-cache-clear`
108
- - Cache is automatic for web_read operations
151
+ - Cache includes smart-fetch results (keyed by URL + browser + format + maxChars)
@@ -0,0 +1,36 @@
1
+ /**
2
+ * @unipi/web-api — Engine Constants
3
+ *
4
+ * Default values for the smart-fetch engine.
5
+ */
6
+
7
+ /** Default browser TLS fingerprint profile (the documented default of the `browser` option) */
8
+ export const DEFAULT_BROWSER = "chrome_145";
9
+
10
+ /** Default OS fingerprint (the documented default of the `os` option) */
11
+ export const DEFAULT_OS = "windows";
12
+
13
+ /** Default maximum content length in characters (the documented default of the `maxChars` option) */
14
+ export const DEFAULT_MAX_CHARS = 50000;
15
+
16
+ /** Default request timeout in milliseconds, i.e. 15 seconds (the documented default of the `timeoutMs` option) */
17
+ export const DEFAULT_TIMEOUT_MS = 15000;
18
+
19
+ /** Default batch concurrency: number of concurrent requests for a batch read (the documented default of `batchConcurrency`) */
20
+ export const DEFAULT_BATCH_CONCURRENCY = 8;
21
+
22
+ /** Default removeImages setting: image references are kept unless the caller asks to strip them */
23
+ export const DEFAULT_REMOVE_IMAGES = false;
24
+
25
+ /** Default includeReplies setting; the value may be a boolean or the literal "extractors" */
26
+ export const DEFAULT_INCLUDE_REPLIES: boolean | "extractors" = "extractors";
27
+
28
+ /** Default output format; `as const` keeps the literal type "markdown" instead of widening to string */
29
+ export const DEFAULT_FORMAT = "markdown" as const;
30
+
31
+ /** Default HTTP headers: an Accept value that prefers HTML/XHTML and a US-English Accept-Language */
32
+ export const DEFAULT_HEADERS: Record<string, string> = {
33
+ Accept:
34
+ "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
35
+ "Accept-Language": "en-US,en;q=0.9",
36
+ };
@@ -0,0 +1,145 @@
1
+ /**
2
+ * @unipi/web-api — Runtime Dependencies
3
+ *
4
+ * Lazy-loaded dependencies for the smart-fetch engine.
5
+ * Uses dynamic imports to handle optional native binding failures gracefully.
6
+ */
7
+
8
+ let wreqModule: any = null; // cached wreq-js module; stays null until getWreq() first succeeds
9
+ let defuddleModule: any = null; // cached defuddle module; stays null until getDefuddle() first succeeds
10
+ let lodashModule: any = null; // cached lodash module; stays null until getLodash() first succeeds
11
+ let mimeTypesModule: any = null; // cached mime-types module; stays null until getMimeTypes() first succeeds
12
+
13
+ /**
14
+ * Get the wreq-js module, importing it on the first call and returning the cached module on later calls.
15
+ * Throws an Error carrying an install hint and the underlying message if the import fails; a failed import is not cached, so a later call attempts the import again.
16
+ *
17
+ * @returns the module namespace produced by `import("wreq-js")`
18
+ */
19
+ export async function getWreq(): Promise<any> {
20
+ if (wreqModule) {
21
+ return wreqModule;
22
+ }
23
+
24
+ try {
25
+ // Use dynamic import for ESM compatibility; the import only runs when this function is first called
26
+ wreqModule = await import("wreq-js");
27
+ return wreqModule;
28
+ } catch (error) {
29
+ throw new Error(
30
+ `wreq-js is not available. ` +
31
+ `This is required for browser-grade TLS fingerprinting. ` +
32
+ `Run: npm install wreq-js\n` +
33
+ `Error: ${error instanceof Error ? error.message : String(error)}`
34
+ );
35
+ }
36
+ }
37
+
38
+ /**
39
+ * Get the defuddle module, importing it on the first call and returning the cached module on later calls.
40
+ * Throws an Error carrying an install hint and the underlying message if the import fails; a failed import is not cached, so a later call attempts the import again.
41
+ *
42
+ * @returns the module namespace produced by `import("defuddle")`
43
+ */
44
+ export async function getDefuddle(): Promise<any> {
45
+ if (defuddleModule) {
46
+ return defuddleModule;
47
+ }
48
+
49
+ try {
50
+ defuddleModule = await import("defuddle");
51
+ return defuddleModule;
52
+ } catch (error) {
53
+ throw new Error(
54
+ `defuddle is not available. ` +
55
+ `This is required for intelligent content extraction. ` +
56
+ `Run: npm install defuddle\n` +
57
+ `Error: ${error instanceof Error ? error.message : String(error)}`
58
+ );
59
+ }
60
+ }
61
+
62
+ /**
63
+ * Get the lodash module, importing it on the first call and returning the cached module on later calls. Throws an Error with an install hint if the import fails.
64
+ * NOTE(review): the value is the namespace from `import("lodash")`, so callers may need `.default` to reach the lodash functions — confirm at the call sites.
65
+ * @returns the module namespace produced by `import("lodash")`
66
+ */
67
+ export async function getLodash(): Promise<any> {
68
+ if (lodashModule) {
69
+ return lodashModule;
70
+ }
71
+
72
+ try {
73
+ lodashModule = await import("lodash");
74
+ return lodashModule;
75
+ } catch (error) {
76
+ throw new Error(
77
+ `lodash is not available. ` +
78
+ `Run: npm install lodash\n` +
79
+ `Error: ${error instanceof Error ? error.message : String(error)}`
80
+ );
81
+ }
82
+ }
83
+
84
+ /**
85
+ * Get the mime-types module, importing it on the first call and returning the cached module on later calls. Throws an Error with an install hint if the import fails.
86
+ * NOTE(review): the value is the namespace from `import("mime-types")`, so callers may need `.default` — confirm at the call sites.
87
+ * @returns the module namespace produced by `import("mime-types")`
88
+ */
89
+ export async function getMimeTypes(): Promise<any> {
90
+ if (mimeTypesModule) {
91
+ return mimeTypesModule;
92
+ }
93
+
94
+ try {
95
+ mimeTypesModule = await import("mime-types");
96
+ return mimeTypesModule;
97
+ } catch (error) {
98
+ throw new Error(
99
+ `mime-types is not available. ` +
100
+ `Run: npm install mime-types\n` +
101
+ `Error: ${error instanceof Error ? error.message : String(error)}`
102
+ );
103
+ }
104
+ }
105
+
106
+ /**
107
+ * Check whether wreq-js, defuddle, lodash, and mime-types can each be loaded. Every loader is tried in turn, even after an earlier one fails.
108
+ * NOTE(review): linkedom is declared as a dependency in package.json but is not probed here — confirm this is intentional.
109
+ * @returns an object with `available` (true when `missing` is empty) and `missing` (names of the packages whose loader threw)
110
+ */
111
+ export async function checkDependencies(): Promise<{
112
+ available: boolean;
113
+ missing: string[];
114
+ }> {
115
+ const missing: string[] = [];
116
+
117
+ try {
118
+ await getWreq();
119
+ } catch { // only the package name is recorded; the loader's detailed error message is dropped
120
+ missing.push("wreq-js");
121
+ }
122
+
123
+ try {
124
+ await getDefuddle();
125
+ } catch {
126
+ missing.push("defuddle");
127
+ }
128
+
129
+ try {
130
+ await getLodash();
131
+ } catch {
132
+ missing.push("lodash");
133
+ }
134
+
135
+ try {
136
+ await getMimeTypes();
137
+ } catch {
138
+ missing.push("mime-types");
139
+ }
140
+
141
+ return {
142
+ available: missing.length === 0,
143
+ missing,
144
+ };
145
+ }